In [1]:
import pandas as pd
In [2]:
A  = pd.read_csv("C:/Users/cheshi/Desktop/NEW PROJECTS FOR DELOITTE/faces with no emoji/weatherAUS (1).csv")
In [3]:
A
Out[3]:
Date Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am ... Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday RISK_MM RainTomorrow
0 2008-12-01 Albury 13.4 22.9 0.6 NaN NaN W 44.0 W ... 22.0 1007.7 1007.1 8.0 NaN 16.9 21.8 No 0.0 No
1 2008-12-02 Albury 7.4 25.1 0.0 NaN NaN WNW 44.0 NNW ... 25.0 1010.6 1007.8 NaN NaN 17.2 24.3 No 0.0 No
2 2008-12-03 Albury 12.9 25.7 0.0 NaN NaN WSW 46.0 W ... 30.0 1007.6 1008.7 NaN 2.0 21.0 23.2 No 0.0 No
3 2008-12-04 Albury 9.2 28.0 0.0 NaN NaN NE 24.0 SE ... 16.0 1017.6 1012.8 NaN NaN 18.1 26.5 No 1.0 No
4 2008-12-05 Albury 17.5 32.3 1.0 NaN NaN W 41.0 ENE ... 33.0 1010.8 1006.0 7.0 8.0 17.8 29.7 No 0.2 No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
142188 2017-06-20 Uluru 3.5 21.8 0.0 NaN NaN E 31.0 ESE ... 27.0 1024.7 1021.2 NaN NaN 9.4 20.9 No 0.0 No
142189 2017-06-21 Uluru 2.8 23.4 0.0 NaN NaN E 31.0 SE ... 24.0 1024.6 1020.3 NaN NaN 10.1 22.4 No 0.0 No
142190 2017-06-22 Uluru 3.6 25.3 0.0 NaN NaN NNW 22.0 SE ... 21.0 1023.5 1019.1 NaN NaN 10.9 24.5 No 0.0 No
142191 2017-06-23 Uluru 5.4 26.9 0.0 NaN NaN N 37.0 SE ... 24.0 1021.0 1016.8 NaN NaN 12.5 26.1 No 0.0 No
142192 2017-06-24 Uluru 7.8 27.0 0.0 NaN NaN SE 28.0 SSE ... 24.0 1019.4 1016.5 3.0 2.0 15.1 26.0 No 0.0 No

142193 rows × 24 columns

In [4]:
A.isna().sum()
Out[4]:
Date                 0
Location             0
MinTemp            637
MaxTemp            322
Rainfall          1406
Evaporation      60843
Sunshine         67816
WindGustDir       9330
WindGustSpeed     9270
WindDir9am       10013
WindDir3pm        3778
WindSpeed9am      1348
WindSpeed3pm      2630
Humidity9am       1774
Humidity3pm       3610
Pressure9am      14014
Pressure3pm      13981
Cloud9am         53657
Cloud3pm         57094
Temp9am            904
Temp3pm           2726
RainToday         1406
RISK_MM              0
RainTomorrow         0
dtype: int64
In [5]:
for i in A.columns:
    if(A[i].dtypes == "object"):
        x = A[i].mode()[0]
        A[i] = A[i].fillna(x)
    else:
        x = A[i].mean()
        A[i] = A[i].fillna(x)
In [6]:
con = []
for i in A.columns:
    if(A[i].dtypes != "object"):
        con.append(i)
In [7]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
A1 = pd.DataFrame(ss.fit_transform(A[con]),columns = con)
In [8]:
out = []
for i in con:
    Q = list(A1[(A1[i]>3)|(A1[i]<-3)].index)
    out.extend(Q)

from numpy import unique
outliers=unique(out)
In [9]:
outliers
Out[9]:
array([     8,     12,     51, ..., 142014, 142126, 142127])
In [10]:
A = A.drop(index = outliers,axis = 0)
In [11]:
A.shape
Out[11]:
(131761, 24)
In [12]:
A.index = range(0,131761)
In [ ]:
 
In [13]:
con = []
cat = []
for i in A.columns:
    if(A[i].dtypes == "object"):
        cat.append(i)
    else:
        con.append(i)
        
A1 = pd.get_dummies(A[cat])
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
A2 = pd.DataFrame(ss.fit_transform(A[con]),columns=con)
Xnew = A2.join(A1)
In [14]:
pip install statsmodels
Requirement already satisfied: statsmodels in d:\users\cheshi\anaconda4\lib\site-packages (0.14.0)
Requirement already satisfied: scipy!=1.9.2,>=1.4 in d:\users\cheshi\anaconda4\lib\site-packages (from statsmodels) (1.11.1)
Requirement already satisfied: pandas>=1.0 in d:\users\cheshi\anaconda4\lib\site-packages (from statsmodels) (2.0.3)
Requirement already satisfied: numpy>=1.18 in d:\users\cheshi\anaconda4\lib\site-packages (from statsmodels) (1.25.0)
Requirement already satisfied: patsy>=0.5.2 in d:\users\cheshi\anaconda4\lib\site-packages (from statsmodels) (0.5.3)
Requirement already satisfied: packaging>=21.3 in d:\users\cheshi\anaconda4\lib\site-packages (from statsmodels) (23.0)
Requirement already satisfied: python-dateutil>=2.8.2 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=1.0->statsmodels) (2.8.2)
Requirement already satisfied: tzdata>=2022.1 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=1.0->statsmodels) (2023.3)
Requirement already satisfied: pytz>=2020.1 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=1.0->statsmodels) (2022.7)
Requirement already satisfied: six in d:\users\cheshi\anaconda4\lib\site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [15]:
A['RainToday'].replace({'No': 0, 'Yes': 1},inplace = True)
A['RainTomorrow'].replace({'No': 0, 'Yes': 1},inplace = True)
In [16]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize = (8,5))
A.RainTomorrow.value_counts(normalize = True).plot(kind='bar', color= ['skyblue','navy'], alpha = 0.9, rot=0)
plt.title('RainTomorrow Indicator No(0) and Yes(1) in the Imbalanced Dataset')
plt.show()
In [17]:
from sklearn.utils import resample

no = A[A.RainTomorrow == 0]
yes = A[A.RainTomorrow == 1]
yes_oversampled = resample(yes, replace=True, n_samples=len(no), random_state=123)
oversampled = pd.concat([no, yes_oversampled])

fig = plt.figure(figsize = (8,5))
oversampled.RainTomorrow.value_counts(normalize = True).plot(kind='bar', color= ['skyblue','navy'], alpha = 0.9, rot=0)
plt.title('RainTomorrow Indicator No(0) and Yes(1) after Oversampling (Balanced Dataset)')
plt.show()
In [18]:
# Missing Data Pattern in Training Data
import seaborn as sns
sns.heatmap(oversampled.isnull(), cbar=False, cmap='PuBu')
Out[18]:
<Axes: >
In [19]:
total = oversampled.isnull().sum().sort_values(ascending=False)
percent = (oversampled.isnull().sum()/oversampled.isnull().count()).sort_values(ascending=False)
missing = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing.head(4)
Out[19]:
Total Percent
Date 0 0.0
Location 0 0.0
RISK_MM 0 0.0
RainToday 0 0.0
In [20]:
oversampled.select_dtypes(include=['object']).columns
Out[20]:
Index(['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], dtype='object')
In [28]:
# Impute categorical var with Mode
oversampled['Date'] = oversampled['Date'].fillna(oversampled['Date'].mode()[0])
oversampled['Location'] = oversampled['Location'].fillna(oversampled['Location'].mode()[0])
oversampled['WindGustDir'] = oversampled['WindGustDir'].fillna(oversampled['WindGustDir'].mode()[0])
oversampled['WindDir9am'] = oversampled['WindDir9am'].fillna(oversampled['WindDir9am'].mode()[0])
oversampled['WindDir3pm'] = oversampled['WindDir3pm'].fillna(oversampled['WindDir3pm'].mode()[0])
In [29]:
# Convert categorical features to continuous features with Label Encoding
from sklearn.preprocessing import LabelEncoder
lencoders = {}
for col in oversampled.select_dtypes(include=['object']).columns:
    lencoders[col] = LabelEncoder()
    oversampled[col] = lencoders[col].fit_transform(oversampled[col])
In [30]:
from sklearn.preprocessing import LabelEncoder
lencoders = {}
for col in oversampled.select_dtypes(include =["object"]).columns:
    lencoderds[col] = LabelEncoder()
    oversampled[col] = lencoders[col].fit_transform(oversampled[col])
In [32]:
import warnings
warnings.filterwarnings("ignore")
# Multiple Imputation by Chained Equations
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
MiceImputed = oversampled.copy(deep=True) 
mice_imputer = IterativeImputer()
MiceImputed.iloc[:, :] = mice_imputer.fit_transform(oversampled)
In [33]:
 
Out[33]:
<pandas.core.indexing._iLocIndexer at 0x1fd21891120>
In [34]:
# Detecting outliers with IQR
Q1 = MiceImputed.quantile(0.25)
Q3 = MiceImputed.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
Date             1540.000000
Location           25.000000
MinTemp             9.000000
MaxTemp            10.000000
Rainfall            1.800000
Evaporation         1.669824
Sunshine            1.024853
WindGustDir         9.000000
WindGustSpeed      17.000000
WindDir9am          8.000000
WindDir3pm          8.000000
WindSpeed9am       12.000000
WindSpeed3pm       11.000000
Humidity9am        25.000000
Humidity3pm        29.000000
Pressure9am         8.300000
Pressure3pm         8.400000
Cloud9am            2.562811
Cloud3pm            2.496833
Temp9am             9.100000
Temp3pm             9.400000
RainToday           1.000000
RISK_MM             4.400000
RainTomorrow        1.000000
dtype: float64
In [35]:
# Removing outliers from the dataset
MiceImputed = MiceImputed[~((MiceImputed < (Q1 - 1.5 * IQR)) |(MiceImputed > (Q3 + 1.5 * IQR))).any(axis=1)]
MiceImputed.shape
Out[35]:
(89193, 24)
In [42]:
# Correlation Heatmap
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
corr = MiceImputed.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(20, 20))
cmap = sns.diverging_palette(250, 25, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=None, center=0,square=True, annot=True, linewidths=.5, cbar_kws={"shrink": .9})
Out[42]:
<Axes: >
In [43]:
sns.pairplot( data=MiceImputed, vars=('MaxTemp','MinTemp','Pressure9am','Pressure3pm', 'Temp9am', 'Temp3pm', 'Evaporation'), hue='RainTomorrow' )
Out[43]:
<seaborn.axisgrid.PairGrid at 0x1fd0b9edc30>
In [46]:
# Standardizing data
from sklearn import preprocessing
r_scaler = preprocessing.MinMaxScaler()
r_scaler.fit(MiceImputed)
modified_data = pd.DataFrame(r_scaler.transform(MiceImputed), index=MiceImputed.index, columns=MiceImputed.columns)
In [49]:
# Feature Importance using Filter Method (Chi-Square)
from sklearn.feature_selection import SelectKBest, chi2
X = modified_data.loc[:,modified_data.columns!='RainTomorrow']
y = modified_data[['RainTomorrow']]
selector = SelectKBest(chi2, k=10)
selector.fit(X, y)
X_new = selector.transform(X)
print(X.columns[selector.get_support(indices=True)])
Index(['Rainfall', 'WindGustSpeed', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud3pm', 'Temp3pm', 'RainToday',
       'RISK_MM'],
      dtype='object')
In [51]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier as rf

X = MiceImputed.drop('RainTomorrow', axis=1)
y = MiceImputed['RainTomorrow']
selector = SelectFromModel(rf(n_estimators=100, random_state=0))
selector.fit(X, y)
support = selector.get_support()
features = X.loc[:,support].columns.tolist()
print(features)
print(rf(n_estimators=100, random_state=0).fit(X,y).feature_importances_)
['RISK_MM']
[0.00361265 0.00303465 0.00515607 0.00533627 0.01364907 0.00146548
 0.00254283 0.00249897 0.00957051 0.00263579 0.00331491 0.00245111
 0.0030674  0.00845189 0.04282585 0.01233348 0.01308554 0.00257482
 0.00905154 0.0042079  0.00755055 0.00416249 0.8374202 ]
In [52]:
features = MiceImputed[['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 
                       'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 
                       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 
                       'RainToday']]
target = MiceImputed['RainTomorrow']

# Split into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=12345)

# Normalize Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
In [53]:
def plot_roc_cur(fper, tper):  
    plt.plot(fper, tper, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()
In [58]:
import time
from sklearn.metrics import accuracy_score, roc_auc_score, cohen_kappa_score, roc_curve, classification_report
def run_model(model, X_train, y_train, X_test, y_test, verbose=True):
    t0=time.time()
    if verbose == False:
        model.fit(X_train,y_train, verbose=0)
    else:
        model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred) 
    coh_kap = cohen_kappa_score(y_test, y_pred)
    time_taken = time.time()-t0
    print("Accuracy = {}".format(accuracy))
    print("ROC Area under Curve = {}".format(roc_auc))
    print("Cohen's Kappa = {}".format(coh_kap))
    print("Time taken = {}".format(time_taken))
    print(classification_report(y_test,y_pred,digits=5))
    
    probs = model.predict_proba(X_test)  
    probs = probs[:, 1]  
    fper, tper, thresholds = roc_curve(y_test, probs) 
    plot_roc_cur(fper, tper)
    
    plot_confusion_matrix(model, X_test, y_test,cmap=plt.cm.Blues, normalize = 'all')
    
    return model, accuracy, roc_auc, coh_kap, time_taken
In [59]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

params_lr = {'penalty': 'l1', 'solver':'liblinear'}

model_lr = LogisticRegression(**params_lr)
model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr = run_model(model_lr, X_train, y_train, X_test, y_test)

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

params_dt = {'max_depth': 16,
             'max_features': "sqrt"}

model_dt = DecisionTreeClassifier(**params_dt)
model_dt, accuracy_dt, roc_auc_dt, coh_kap_dt, tt_dt = run_model(model_dt, X_train, y_train, X_test, y_test)

# Neural Network
from sklearn.neural_network import MLPClassifier

params_nn = {'hidden_layer_sizes': (30,30,30),
             'activation': 'logistic',
             'solver': 'lbfgs',
             'max_iter': 500}

model_nn = MLPClassifier(**params_nn)
model_nn, accuracy_nn, roc_auc_nn, coh_kap_nn, tt_nn = run_model(model_nn, X_train, y_train, X_test, y_test)

# Random Forest
from sklearn.ensemble import RandomForestClassifier

params_rf = {'max_depth': 16,
             'min_samples_leaf': 1,
             'min_samples_split': 2,
             'n_estimators': 100,
             'random_state': 12345}

model_rf = RandomForestClassifier(**params_rf)
model_rf, accuracy_rf, roc_auc_rf, coh_kap_rf, tt_rf = run_model(model_rf, X_train, y_train, X_test, y_test)

# Light GBM
import lightgbm as lgb
params_lgb ={'colsample_bytree': 0.95, 
         'max_depth': 16, 
         'min_split_gain': 0.1, 
         'n_estimators': 200, 
         'num_leaves': 50, 
         'reg_alpha': 1.2, 
         'reg_lambda': 1.2, 
         'subsample': 0.95, 
         'subsample_freq': 20}

model_lgb = lgb.LGBMClassifier(**params_lgb)
model_lgb, accuracy_lgb, roc_auc_lgb, coh_kap_lgb, tt_lgb = run_model(model_lgb, X_train, y_train, X_test, y_test)

# Catboost
!pip install catboost
import catboost as cb
params_cb ={'iterations': 50,
            'max_depth': 16}

model_cb = cb.CatBoostClassifier(**params_cb)
model_cb, accuracy_cb, roc_auc_cb, coh_kap_cb, tt_cb = run_model(model_cb, X_train, y_train, X_test, y_test, verbose=False)

# XGBoost
import xgboost as xgb
params_xgb ={'n_estimators': 500,
            'max_depth': 16}

model_xgb = xgb.XGBClassifier(**params_xgb)
model_xgb, accuracy_xgb, roc_auc_xgb, coh_kap_xgb, tt_xgb = run_model(model_xgb, X_train, y_train, X_test, y_test)
Accuracy = 0.7309296381003633
ROC Area under Curve = 0.7131732529423768
Cohen's Kappa = 0.43544238478007957
Time taken = 1.1369616985321045
              precision    recall  f1-score   support

           0    0.74249   0.82234   0.78038     12963
           1    0.71002   0.60401   0.65274      9336

    accuracy                        0.73093     22299
   macro avg    0.72626   0.71317   0.71656     22299
weighted avg    0.72890   0.73093   0.72694     22299

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[59], line 7
      4 params_lr = {'penalty': 'l1', 'solver':'liblinear'}
      6 model_lr = LogisticRegression(**params_lr)
----> 7 model_lr, accuracy_lr, roc_auc_lr, coh_kap_lr, tt_lr = run_model(model_lr, X_train, y_train, X_test, y_test)
      9 # Decision Tree
     10 from sklearn.tree import DecisionTreeClassifier

Cell In[58], line 25, in run_model(model, X_train, y_train, X_test, y_test, verbose)
     22 fper, tper, thresholds = roc_curve(y_test, probs) 
     23 plot_roc_cur(fper, tper)
---> 25 plot_confusion_matrix(model, X_test, y_test,cmap=plt.cm.Blues, normalize = 'all')
     27 return model, accuracy, roc_auc, coh_kap, time_taken

NameError: name 'plot_confusion_matrix' is not defined
In [71]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.plotting import plot_decision_regions

value = 1.80
width = 0.90

clf1 = LogisticRegression(random_state=12345)
clf2 = DecisionTreeClassifier(random_state=12345) 
clf3 = MLPClassifier(random_state=12345, verbose = 0)
clf4 = RandomForestClassifier(random_state=12345)
clf5 = lgb.LGBMClassifier(random_state=12345, verbose = 0)
clf6 = cb.CatBoostClassifier(random_state=12345, verbose = 0)
clf7 = xgb.XGBClassifier(random_state=12345)
eclf = EnsembleVoteClassifier(clfs=[clf4, clf5, clf6, clf7], weights=[1, 1, 1, 1], voting='soft')

X_list = MiceImputed[["Sunshine", "Humidity9am", "Cloud3pm"]] #took only really important features
X = np.asarray(X_list, dtype=np.float32)
y_list = MiceImputed["RainTomorrow"]
y = np.asarray(y_list, dtype=np.int32)

# Plotting Decision Regions
gs = gridspec.GridSpec(3,3)
fig = plt.figure(figsize=(18, 14))

labels = ['Logistic Regression',
          'Decision Tree',
          'Neural Network',
          'Random Forest',
          'LightGBM',
          'CatBoost',
          'XGBoost',
          'Ensemble']

for clf, lab, grd in zip([clf1, clf2, clf3, clf4, clf5, clf6, clf7, eclf],
                         labels,
                         itertools.product([0, 1, 2],
                         repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, 
                                filler_feature_values={2: value}, 
                                filler_feature_ranges={2: width}, 
                                legend=2)
    plt.title(lab)

plt.show()
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001348 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
In [61]:
pip install lightgbm
Collecting lightgbmNote: you may need to restart the kernel to use updated packages.

  Downloading lightgbm-4.0.0-py3-none-win_amd64.whl (1.3 MB)
     ---------------------------------------- 1.3/1.3 MB 3.0 MB/s eta 0:00:00
Requirement already satisfied: numpy in d:\users\cheshi\anaconda4\lib\site-packages (from lightgbm) (1.25.0)
Requirement already satisfied: scipy in d:\users\cheshi\anaconda4\lib\site-packages (from lightgbm) (1.11.1)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.0.0
In [63]:
pip install catboost
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-win_amd64.whl (101.0 MB)
     -------------------------------------- 101.0/101.0 MB 3.9 MB/s eta 0:00:00
Collecting plotly
  Downloading plotly-5.15.0-py2.py3-none-any.whl (15.5 MB)
     ---------------------------------------- 15.5/15.5 MB 2.4 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.16.0 in d:\users\cheshi\anaconda4\lib\site-packages (from catboost) (1.25.0)
Requirement already satisfied: six in d:\users\cheshi\anaconda4\lib\site-packages (from catboost) (1.16.0)
Requirement already satisfied: pandas>=0.24 in d:\users\cheshi\anaconda4\lib\site-packages (from catboost) (2.0.3)
Requirement already satisfied: matplotlib in d:\users\cheshi\anaconda4\lib\site-packages (from catboost) (3.7.2)
Requirement already satisfied: scipy in d:\users\cheshi\anaconda4\lib\site-packages (from catboost) (1.11.1)
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
     ---------------------------------------- 47.0/47.0 kB 1.2 MB/s eta 0:00:00
Requirement already satisfied: tzdata>=2022.1 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=0.24->catboost) (2023.3)
Requirement already satisfied: pytz>=2020.1 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=0.24->catboost) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.2 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=0.24->catboost) (2.8.2)
Requirement already satisfied: cycler>=0.10 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib->catboost) (0.11.0)
Requirement already satisfied: contourpy>=1.0.1 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib->catboost) (1.1.0)
Requirement already satisfied: pillow>=6.2.0 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib->catboost) (9.4.0)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib->catboost) (1.4.4)
Requirement already satisfied: packaging>=20.0 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib->catboost) (23.0)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib->catboost) (3.0.9)
Requirement already satisfied: fonttools>=4.22.0 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib->catboost) (4.41.0)
Collecting tenacity>=6.2.0
  Using cached tenacity-8.2.2-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.2 graphviz-0.20.1 plotly-5.15.0 tenacity-8.2.2
Note: you may need to restart the kernel to use updated packages.
In [65]:
pip install xgboost
Collecting xgboost
  Downloading xgboost-1.7.6-py3-none-win_amd64.whl (70.9 MB)
     ---------------------------------------- 70.9/70.9 MB 3.6 MB/s eta 0:00:00
Requirement already satisfied: scipy in d:\users\cheshi\anaconda4\lib\site-packages (from xgboost) (1.11.1)
Requirement already satisfied: numpy in d:\users\cheshi\anaconda4\lib\site-packages (from xgboost) (1.25.0)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.6
Note: you may need to restart the kernel to use updated packages.
In [67]:
pip install mlxtend
Collecting mlxtend
  Downloading mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
     ---------------------------------------- 1.4/1.4 MB 6.6 MB/s eta 0:00:00
Requirement already satisfied: setuptools in d:\users\cheshi\anaconda4\lib\site-packages (from mlxtend) (65.6.3)
Requirement already satisfied: matplotlib>=3.0.0 in d:\users\cheshi\anaconda4\lib\site-packages (from mlxtend) (3.7.2)
Requirement already satisfied: joblib>=0.13.2 in d:\users\cheshi\anaconda4\lib\site-packages (from mlxtend) (1.3.1)
Requirement already satisfied: numpy>=1.16.2 in d:\users\cheshi\anaconda4\lib\site-packages (from mlxtend) (1.25.0)
Requirement already satisfied: scikit-learn>=1.0.2 in d:\users\cheshi\anaconda4\lib\site-packages (from mlxtend) (1.3.0)
Requirement already satisfied: pandas>=0.24.2 in d:\users\cheshi\anaconda4\lib\site-packages (from mlxtend) (2.0.3)
Requirement already satisfied: scipy>=1.2.1 in d:\users\cheshi\anaconda4\lib\site-packages (from mlxtend) (1.11.1)
Requirement already satisfied: kiwisolver>=1.0.1 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (1.4.4)
Requirement already satisfied: fonttools>=4.22.0 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (4.41.0)
Requirement already satisfied: pillow>=6.2.0 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (9.4.0)
Requirement already satisfied: cycler>=0.10 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (0.11.0)
Requirement already satisfied: packaging>=20.0 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (23.0)
Requirement already satisfied: python-dateutil>=2.7 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (2.8.2)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (3.0.9)
Requirement already satisfied: contourpy>=1.0.1 in d:\users\cheshi\anaconda4\lib\site-packages (from matplotlib>=3.0.0->mlxtend) (1.1.0)
Requirement already satisfied: tzdata>=2022.1 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=0.24.2->mlxtend) (2023.3)
Requirement already satisfied: pytz>=2020.1 in d:\users\cheshi\anaconda4\lib\site-packages (from pandas>=0.24.2->mlxtend) (2022.7)
Requirement already satisfied: threadpoolctl>=2.0.0 in d:\users\cheshi\anaconda4\lib\site-packages (from scikit-learn>=1.0.2->mlxtend) (3.1.0)
Requirement already satisfied: six>=1.5 in d:\users\cheshi\anaconda4\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.0.0->mlxtend) (1.16.0)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.22.0
Note: you may need to restart the kernel to use updated packages.